####################################
### Analysis 11/2019
### supports revised paper from POP R&R
################################

library(readr)
library(readxl)
library(texreg)
library(dplyr)
library(tidyverse)
library(reshape2)
library(nnet)
library(ggplot2)
library(VennDiagram)
library(expss)
library(outreg)
library(stargazer)
library(writexl)

######################################################################
## Tables and Figures  
######################################################################

## Work from the submission folder so that the relative export paths used
## below (FiguresTF/, FiguresPNG/, Tables/, Codebook&Code/) resolve.
# setwd("~/Dropbox/Wikipedia Study/jean_file/final_figures")
setwd("~/Dropbox/Wikipedia Study/PublicationProcess/Final Submission/")
full <- read.csv(
  "~/Dropbox/Wikipedia Study/PublicationProcess/Final Submission/Codebook&Code/Alter_et_al_final_dataset.csv",
  stringsAsFactors = FALSE,
  na.strings = "NA"
)

# Copy the log to a text file; split = TRUE keeps it on the console as well.
# NOTE(review): sink() handles one stream per call, so passing both type
# values probably diverts standard output only -- confirm if messages matter.
sink(
  file = "./alter_log_figures.txt",
  append = FALSE,
  type = c("output", "message"),
  split = TRUE
)

# How many departments do we have (distinct non-missing b_institution values)?
nrow(table(full$b_institution)) # 320

## Figure 1: counts behind the rank-by-institution bar graph.
## Note: the graph drawn here is a rough ("ugly") version and does not
## exactly reproduce the figure printed in the paper.

# Fix the display order of academic ranks; `full` keeps this factor from here on.
full$b_positioncat <- factor(
  full$b_positioncat,
  levels = c("Assistant", "Associate", "Professor", "Emeritus", "Non-TT")
)

# Rank by institution type, restricted to rows flagged b_baseline == 1.
baseline_rows <- full[full$b_baseline == 1, ]
table(baseline_rows$b_positioncat, baseline_rows$b_typeinstitution)

# One row per (rank, institution type) pair with its head count in `n`.
fig1_counts <- full %>%
  filter(!is.na(b_positioncat)) %>%
  group_by(b_positioncat, b_typeinstitution) %>%
  tally() %>%
  ungroup() %>%
  group_by(b_positioncat)

fig1 <- ggplot(
  fig1_counts,
  aes(
    x = b_positioncat, y = n, fill = b_typeinstitution,
    label = prettyNum(n, big.mark = ",")
  )
) +
  geom_bar(stat = "identity", color = "black", width = 0.9) +
  geom_text(
    size = 3, position = position_stack(vjust = 0.5),
    color = "white", fontface = "bold"
  ) +
  scale_y_continuous(labels = scales::comma) +
  theme_bw() +
  scale_fill_grey(name = "Institution Type") +
  labs(
    caption = "Source: Alter et al. Baseline dataset, \n Number of individuals, n=6696",
    x = "Academic Rank",
    y = "Number of Individuals"
  ) +
  # Get rid of the vertical grid lines and move the legend under the plot.
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  )

ggsave(filename = "FiguresTF/alter_fig1.tiff", plot = fig1, width = 6.5, dpi = 900)
ggsave(filename = "FiguresPNG/alter_fig1.png", plot = fig1, width = 6.5, dpi = 900)
fig1


## Figure 2: gender composition within each academic rank (baseline rows).
baseline_rows <- full[full$b_baseline == 1, ]
genderTab <- table(baseline_rows$b_positioncat, baseline_rows$gender)
options(digits = 3)

# Long-format row proportions, with the raw counts attached as a 4th column.
pt_gt <- melt(prop.table(genderTab, 1), id = c(0, 1))
pt_gt2 <- melt(genderTab, id = c(0, 1))
pt_gt <- cbind(pt_gt, pt_gt2[, 3])
names(pt_gt) <- c("Position", "Gender", "Proportion", "Num")

pt_gt$Position <- factor(
  pt_gt$Position,
  levels = c("Assistant", "Associate", "Professor", "Emeritus", "Non-TT")
)
# gender is coded 1 / 0 in the data; label it Men / Women for the legend.
pt_gt$Gender <- factor(pt_gt$Gender, levels = c(1, 0), labels = c("Men", "Women"))

fig2 <- ggplot(pt_gt, aes(x = Position, y = Proportion, fill = factor(Gender))) +
  geom_bar(position = "fill", stat = "identity", color = "black", width = 0.9) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.10)) +
  theme_bw() +
  scale_fill_manual(values = c("grey", "grey28"), name = "Gender") +
  labs(
    caption = "Source: Alter et al. Baseline dataset, \n Number of individuals, n=6696",
    x = "Academic Rank",
    y = "Percent of Individuals"
  ) +
  # Get rid of the vertical grid lines at least.
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  ) +
  # Dashed reference line drawn at 28%.
  geom_hline(yintercept = 0.28, linetype = "dashed", color = "white", size = 1.2) +
  # Print the raw counts inside the stacked segments.
  geom_text(
    aes(label = Num),
    position = position_stack(vjust = 0.25),
    color = "white", fontface = "bold"
  )

ggsave(filename = "FiguresTF/alter_fig2.tiff", plot = fig2, width = 6, dpi = 900)
ggsave(filename = "FiguresPNG/alter_fig2.png", plot = fig2, width = 6, dpi = 900)
fig2

## Table 1 Status Categorizations
# Derived tallies that later sections reuse:
#   s_HonorLec    = honors + lectures
#   s_Leader      = presidencies + editorships
#   sec_pos_total = section positions from both section columns
full$s_HonorLec <- full$s_honors + full$s_lectures
full$s_Leader <- full$s_Pres + full$s_ED
full$sec_pos_total <- full$al_num_section_positions + full$s_num_section_pos

# Grand total per status category (missing values ignored).
t1_row <- full %>%
  gather(Status, Sum, num_status_1_wo_sec, sec_pos_total, num_status_2, s_HonorLec, s_Leader)
t1_totals <- t1_row %>%
  group_by(Status) %>%
  summarize(lineTotal = sum(Sum, na.rm = TRUE))
t1_totals

# Secondary values in the table: how many people hold each kind of status.
table(with(full, num_status_1_wo_sec > 0))
table(with(full, sec_pos_total > 0))
table(with(full, num_status_1_wo_sec > 0 | sec_pos_total > 0))
table(with(full, num_status_2 > 0))
table(with(full, s_Pres > 0 | s_ED > 0))
table(with(full, s_lectures > 0 | s_honors > 0))

# People whose only recorded status is section service, then their
# breakdown by gender and by baseline membership.
section_only <- with(
  full,
  sec_pos_total > 0 & num_status_1_wo_sec == 0 & num_status_2 == 0 & num_status_3 == 0
)
table(as.logical(section_only))
table(full$gender[section_only])
table(full$b_baseline[section_only])

## Figure 3: gender split of status observations by status category.
f3_row <- full %>%
  gather(Status, Sum, num_status_1_wo_sec, sec_pos_total, num_status_2, s_HonorLec, s_Leader)
f3_totals <- f3_row %>%
  group_by(gender, Status) %>%
  summarize(lineTotal = sum(Sum, na.rm = TRUE))
f3_final <- f3_totals %>%
  group_by(Status) %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = paste0(round(100 * lineTotal / Status_Total, 2)))
f3_final

# Wide layout: one row each for Women / Men / Total, one column per category.
fig3 <- data.frame(
  "Gender" = c("Women", "Men", "Total"),
  "APSA/ISA Section (1)" = rep(NA, 3),
  "Committee Member (1)" = rep(NA, 3),
  "Officer (2)" = rep(NA, 3),
  "Leader (3)" = rep(NA, 3),
  "Honor (3)" = rep(NA, 3),
  check.names = FALSE
)

# Status variables in the same order as columns 2..6 of `fig3`.
fig3_status <- c("sec_pos_total", "num_status_1_wo_sec", "num_status_2", "s_Leader", "s_HonorLec")
for (col_i in seq_along(fig3_status)) {
  women_row <- filter(f3_final, gender == 0, Status == fig3_status[col_i])
  men_row <- filter(f3_final, gender == 1, Status == fig3_status[col_i])
  fig3[1, col_i + 1] <- pull(women_row, lineTotal)
  fig3[2, col_i + 1] <- pull(men_row, lineTotal)
  # The category total is read off the men's row.
  fig3[3, col_i + 1] <- pull(men_row, Status_Total)
}
fig3

dfig3 <- melt(fig3, id.vars = "Gender")
# Every observation appears twice in dfig3 (gender row + Total row).
fig3n <- sum(dfig3$value) / 2

# Axis labels carrying the per-category totals (every 3rd row is "Total").
fig3cap <- dfig3$value[seq(3, 15, by = 3)]
fig3cap <- paste0(
  c("APSA/ISA Section (1)", "Committee (1)", "Officer (2)", "Leader (3)", "Honor (3)"),
  " \n (n=", fig3cap, ")"
)

fig3 <- ggplot(
  dfig3[dfig3$Gender != "Total", ],
  aes(x = variable, y = value, fill = (Gender))
) +
  geom_bar(position = "fill", stat = "identity", color = "black", width = 0.9) +
  coord_flip() +
  theme_bw() +
  scale_y_continuous(breaks = seq(0, 1, by = 0.10), labels = scales::percent) +
  scale_x_discrete(labels = fig3cap) +
  scale_fill_manual(values = c("grey", "grey28"), name = "Gender") +
  labs(
    caption = paste0("Source: Alter et al. Status dataset, \n Observations n=", fig3n),
    x = "Status Categorization",
    y = "Percent of Individuals"
  ) +
  theme(panel.grid.major.y = element_blank(), legend.position = "bottom") +
  geom_hline(yintercept = 0.28, linetype = "dashed", color = "white", size = 0.75) +
  # Count labels are placed at fixed positions rather than at the bar heights.
  geom_text(
    aes(label = value, y = 0.5),
    position = position_stack(vjust = 0.22),
    color = "white", fontface = "bold"
  )

ggsave(filename = "FiguresTF/alter_fig3.tiff", plot = fig3, width = 7, dpi = 900)
ggsave(filename = "FiguresPNG/alter_fig3.png", plot = fig3, width = 7, dpi = 900)
fig3

## Table 2: share of women among top-status positions, overall and for
## entries dated 2000 or earlier.
t2_row <- full %>% gather(Status, Count, s_honors, s_lectures, s_Pres, s_ED, s_Assoc_ED)
t2_totals <- t2_row %>%
  group_by(gender, Status) %>%
  summarize(lineTotal = sum(Count, na.rm = TRUE))
t2_final <- t2_totals %>%
  group_by(Status) %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = paste0(round(100 * lineTotal / Status_Total, 2), "%"))
t2_final

# Honors: count gathered values that are <= 2000.
t2b_row <- full %>% gather(Status, Count, s_x, s_x.1, s_x.2, s_x.3)
t2b_totals <- t2b_row %>%
  group_by(gender) %>%
  summarize(lineTotal = sum(Count <= 2000, na.rm = TRUE))
t2b_final <- t2b_totals %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = paste0(round(100 * lineTotal / Status_Total, 2), "%"))

# Lectures: same tally over the lecture columns.
t2c_row <- full %>%
  gather(Status, Count, s_x__2, s_x__3, s_x__4, s_x__5, s_x__6, s_x__7, s_x__8, s_x__9)
t2c_totals <- t2c_row %>%
  group_by(gender) %>%
  summarize(lineTotal = sum(Count <= 2000, na.rm = TRUE))
t2c_final <- t2c_totals %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = paste0(round(100 * lineTotal / Status_Total, 2), "%"))

# Presidents: values strictly below 2001.
t2d_row <- full %>% gather(Status, Count, s_apsa_president2, s_president.isa2)
t2d_totals <- t2d_row %>%
  group_by(gender) %>%
  summarize(lineTotal = sum(Count < 2001, na.rm = TRUE))
t2d_final <- t2d_totals %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = paste0(round(100 * lineTotal / Status_Total, 2), "%"))

# Transposed shell: row 1 holds the column captions, rows 2-5 the categories.
t2 <- t(data.frame(
  "Category" = c("% Women", "Total", "% Women (Pre 2001)", "Total"),
  "Honor Society (3)" = rep(NA, 4),
  "Named Prizes/Lectures (3)" = rep(NA, 4),
  "APSA/ISA Presidents (3)" = rep(NA, 4),
  "Editors (3)" = rep(NA, 4),
  check.names = FALSE
))

# Columns 1-2: women's share and category total over the full period.
t2_status <- c("s_honors", "s_lectures", "s_Pres", "s_ED")
for (row_i in seq_along(t2_status)) {
  women_row <- filter(t2_final, gender == 0, Status == t2_status[row_i])
  t2[row_i + 1, 1] <- pull(women_row, Percent)
  t2[row_i + 1, 2] <- pull(women_row, Status_Total)
}

# Columns 3-4: partial data here because of the year information in the
# data; the Editors row is left empty.
t2_pre2001 <- list(t2b_final, t2c_final, t2d_final)
for (row_i in seq_along(t2_pre2001)) {
  women_row <- filter(t2_pre2001[[row_i]], gender == 0)
  t2[row_i + 1, 3] <- pull(women_row, Percent)
  t2[row_i + 1, 4] <- pull(women_row, Status_Total)
}

write.csv(t2, file = "Tables/alter_table2.csv")




## Figure 4: gender split of association service and leadership roles.

# (a) section positions are stored as counts per person;
# (b) the named offices are tallied by non-missing entries.
fig4_rowa <- full %>% gather(Status, Sum, al_num_section_positions, s_num_section_pos)
fig4_rowb <- full %>%
  gather(
    Status, Count,
    s_apsa_council, s_apsa_vp, s_apsa_secretary, s_vp.1,
    s_apsa_president2, s_president.isa2
  )

fig4_totalsa <- fig4_rowa %>%
  group_by(gender, Status) %>%
  summarize(lineTotal = sum(Sum, na.rm = TRUE))
fig4_finala <- fig4_totalsa %>%
  group_by(Status) %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = paste0(round(lineTotal / Status_Total, 4)))

fig4_totalsb <- fig4_rowb %>%
  group_by(gender, Status) %>%
  summarize(lineTotal = sum(!is.na(Count), na.rm = TRUE))
fig4_finalb <- fig4_totalsb %>%
  group_by(Status) %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = paste0(round(lineTotal / Status_Total, 4)))

fig4_final <- rbind(fig4_finala, fig4_finalb)

fig4_final$gender <- factor(fig4_final$gender, levels = c(0, 1), labels = c("Women", "Men"))
# Drop rows whose gender is missing.
fig4_final <- fig4_final[!is.na(fig4_final$gender), ]
fig4_final
fig4_final$Percent <- as.numeric(fig4_final$Percent)
fig4n <- sum(fig4_final$lineTotal[1:16])

# Facet assignment is positional: 4 section rows, then the 6 offices
# repeated once per gender.
fig4_final$group <- c(
  rep("Committee", 4),
  rep(c("Committee", "Leader", "Officer", "Officer", "Leader", "Officer"), 2)
)
fig4_final$group <- factor(fig4_final$group, levels = c("Committee", "Officer", "Leader"))

# Variable name -> axis label; anything not listed keeps its name.
fig4_labels <- c(
  "al_num_section_positions" = "APSA Section Officer",
  "s_apsa_council" = "APSA Council",
  "s_apsa_secretary" = "APSA Secretary",
  "s_apsa_president2" = "APSA President",
  "s_apsa_vp" = "APSA Vice President",
  "s_vp.1" = "  ISA Vice President",
  "s_president.isa2" = "ISA President",
  "s_section.chair" = "ISA Section Chair",
  "s_num_section_pos" = "ISA Section"
)
fig4_known <- fig4_final$Status %in% names(fig4_labels)
fig4_final$Status[fig4_known] <- unname(fig4_labels[fig4_final$Status[fig4_known]])
fig4_final$Status <- str_wrap(fig4_final$Status, width = 10)

fig4 <- ggplot(fig4_final, aes(x = Status, y = Percent, fill = fct_rev(gender))) +
  geom_bar(position = "stack", stat = "identity", color = "black", width = 0.9) +
  facet_wrap(~group, nrow = 1, scales = "free") +
  scale_y_continuous(breaks = seq(0, 1, by = 0.10), labels = scales::percent) +
  scale_fill_manual(values = c("grey", "grey28"), name = "Gender") +
  theme_bw() +
  labs(
    caption = paste(
      "Source: Alter et al. Status dataset, \n Total Association Leadership (Observations), n=", fig4n,
      "\n Tallied by number of individuals who served the roles"
    ),
    x = "Position",
    y = "Percent of Individuals"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  ) +
  geom_hline(yintercept = 0.28, linetype = "dashed", color = "white", size = 1.2) +
  geom_text(
    aes(label = lineTotal),
    position = position_stack(vjust = 0.25),
    color = "white", fontface = "bold"
  )

ggsave(filename = "FiguresTF/alter_fig4.tiff", plot = fig4, width = 7, dpi = 900)
ggsave(filename = "FiguresPNG/alter_fig4.png", plot = fig4, width = 7, dpi = 900)
fig4

## Figure 5: gender split of editorial-board membership by journal.
fig5_row <- full %>%
  gather(
    Boards, Count,
    s_apsr_editorial.board, s_pt_executive.editorial.committee, s_associate.editors,
    s_editorial.advisory.committee, s_editorial.board.1, s_editorial.board.10, s_editorial.board.11,
    s_editorial.board.2, s_editorial.board.3, s_editorial.board.4, s_editorial.board.6, s_editorial.committee,
    s_editorial.board.7, s_editorial.board.8, s_editorial.board.9, s_editorial.committee.1, s_pt_editorial.board
  )

# Tally non-missing entries per gender and board.
# Two boards are folded into their journal's main board BY NAME before
# tallying, and rows with missing gender are dropped explicitly:
#   s_pt_editorial.board  -> s_pt_executive.editorial.committee
#   s_editorial.committee -> s_editorial.board.4
# This replaces the earlier positional bookkeeping (keep rows 1:34, add the
# counts, delete rows 16/33 and 14/30), which silently picks the wrong rows
# as soon as the set of boards, the gender coding or the sort order changes.
# The result keeps the same rows in the same order (gender, then board name);
# Table 5 further down indexes fig5_final by row position and relies on that.
fig5_totals <- fig5_row %>%
  filter(!is.na(gender)) %>%
  mutate(
    Boards = ifelse(Boards == "s_pt_editorial.board", "s_pt_executive.editorial.committee", Boards),
    Boards = ifelse(Boards == "s_editorial.committee", "s_editorial.board.4", Boards)
  ) %>%
  group_by(gender, Boards) %>%
  summarize(lineTotal = sum(!is.na(Count)))

# Share of each gender within a board.
fig5_final <- fig5_totals %>%
  group_by(Boards) %>%
  mutate(Bd_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = paste0(round(lineTotal / Bd_Total, 4)))
fig5_final$Percent <- as.numeric(fig5_final$Percent)
fig5_final$gender <- factor(fig5_final$gender, levels = c(0, 1), labels = c("Women", "Men"))
# Each board total appears once per gender row, hence the division by 2.
fig5n <- sum(fig5_final$Bd_Total / 2)
#View(fig5_final)

# Journal abbreviations shown on the axis; unlisted names are left unchanged.
fig5_journals <- c(
  "s_apsr_editorial.board" = "APSR",
  "s_associate.editors" = "POP",
  "s_editorial.advisory.committee" = "PB",
  "s_editorial.board.1" = "IO",
  "s_editorial.board.10" = "JOP",
  "s_editorial.board.11" = "PA",
  "s_editorial.board.2" = "CPS",
  "s_editorial.board.3" = "AJPS",
  "s_editorial.board.4" = "WP",
  "s_editorial.board.6" = "JCR",
  "s_editorial.board.7" = "POQ",
  "s_editorial.board.8" = "Int'l Sec",
  "s_editorial.board.9" = "APR",
  "s_editorial.committee.1" = "CP",
  "s_pt_executive.editorial.committee" = "PT"
)
fig5_known <- fig5_final$Boards %in% names(fig5_journals)
fig5_final$Boards[fig5_known] <- unname(fig5_journals[fig5_final$Boards[fig5_known]])

# Caption spelling corrected ("Editoral" -> "Editorial").
fig5 <- ggplot(fig5_final, aes(x = fct_rev(as.factor(Boards)), y = Percent, fill = fct_rev(gender))) +
  geom_bar(position = "stack", stat = "identity", color = "black", width = 0.9) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.10)) +
  theme_bw() +
  scale_fill_manual(values = c("grey", "grey28"), name = "Gender") +
  labs(
    caption = paste0("Source: Alter et al. Status dataset, \n Total Editorial Board Members, n=", fig5n),
    x = "Journal",
    y = "Percent of Individuals"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  ) +
  geom_hline(yintercept = 0.28, linetype = "dashed", color = "white", size = 1.2) +
  geom_text(
    aes(label = lineTotal),
    position = position_stack(vjust = 0.25),
    color = "white", fontface = "bold"
  )
ggsave(fig5, file = "FiguresTF/alter_fig5.tiff", width = 5, dpi = 900)
ggsave(fig5, file = "FiguresPNG/alter_fig5.png", width = 5, dpi = 900)
fig5


# Figure 6: gender split by editorial role (board member, associate editor, editor).
fig6_row <- full %>% gather(Eds, Sum, s_ED_board, s_ED, s_Assoc_ED)
# na.rm = TRUE matches every other tally of these columns in this script;
# without it a single missing value turns a whole bar total into NA.
fig6_totals <- fig6_row %>%
  group_by(gender, Eds) %>%
  summarize(lineTotal = sum(Sum, na.rm = TRUE))
fig6_final <- fig6_totals %>%
  group_by(Eds) %>%
  mutate(Ed_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = round(lineTotal / Ed_Total, 4))
# First six rows = the three roles for gender 0 and gender 1.
fig6_final[1:6, ]

# Rows 1-3 carry one Ed_Total per role, so their sum is the overall n.
fig6n <- sum(fig6_final[c(1, 2, 3), "Ed_Total"])
fig6_final$gender <- factor(fig6_final$gender, levels = c(0, 1), labels = c("Women", "Men"))
fig6_final$Percent <- as.numeric(fig6_final$Percent)
fig6_final$Eds <- factor(fig6_final$Eds, levels = c("s_ED_board", "s_Assoc_ED", "s_ED"))

# Axis labels with per-role totals. Positions 3 / 1 / 2 assume rows 1-3 are
# ordered s_Assoc_ED, s_ED, s_ED_board -- NOTE(review): confirm against the
# printed fig6_final above if the data or sort locale changes.
fig6cap <- fig6_final[c(1:3), 4]
fig6cap <- c(
  paste0("Editorial Board \n (n=", fig6cap[3, ], ")"),
  paste0("Associate Editor \n (n=", fig6cap[1, ], ")"),
  paste0("Editor \n (n=", fig6cap[2, ], ")")
)

# Caption spelling corrected ("Editoral" -> "Editorial").
fig6 <- ggplot(fig6_final[1:6, ], aes(x = Eds, y = Percent, fill = fct_rev(gender))) +
  geom_bar(position = "stack", stat = "identity", color = "black", width = 0.9) +
  scale_x_discrete(labels = fig6cap) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.10)) +
  theme_bw() +
  scale_fill_manual(values = c("grey", "grey28"), name = "Gender") +
  labs(
    caption = paste0("Source: Alter et al. Status dataset, \n Total Editorial Board Members n=", fig6n),
    x = "Editorial Board Position",
    y = "Percent of Individuals"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  ) +
  #geom_hline(yintercept=28, linetype="dashed", color="darkgrey", size=1.2) +
  # Two reference segments: 28% across the first bar, 23% across the other two.
  geom_segment(aes(x = 0.5, y = 0.28, xend = 1.5, yend = 0.28, color = "segment"), linetype = "dashed", color = "white", size = 0.9) +
  geom_segment(aes(x = 1.5, y = 0.23, xend = 3.5, yend = 0.23, color = "segment"), linetype = "dashed", color = "white", size = 0.9) +
  geom_text(
    aes(label = lineTotal),
    position = position_stack(vjust = 0.25),
    color = "white", fontface = "bold"
  )
ggsave(fig6, file = "FiguresTF/alter_fig6.tiff", width = 5.5, dpi = 900)
ggsave(fig6, file = "FiguresPNG/alter_fig6.png", width = 5.5, dpi = 900)
fig6

## Figure 7--now includes people who ONLY have section service
# Membership flags for the four data sources; a missing flag never counts.
in_baseline <- full$b_baseline == 1
in_status <- full$status01sec == 1
in_tt <- full$tt_teele_thelen == 1
in_kg <- full$kg_cite == 1
venn_n <- function(member) sum(member, na.rm = TRUE)

grid.newpage()
fig7 <- draw.quad.venn(
  # Set sizes: 1 = baseline, 2 = status, 3 = Teele & Thelen, 4 = KG citations.
  area1 = venn_n(in_baseline),
  area2 = venn_n(in_status),
  area3 = venn_n(in_tt),
  area4 = venn_n(in_kg),
  # Pairwise overlaps.
  n12 = venn_n(in_baseline & in_status),
  n23 = venn_n(in_status & in_tt),
  n13 = venn_n(in_baseline & in_tt),
  n14 = venn_n(in_kg & in_baseline),
  n24 = venn_n(in_kg & in_status),
  n34 = venn_n(in_kg & in_tt),
  # Three-way overlaps.
  n123 = venn_n(in_baseline & in_status & in_tt),
  n124 = venn_n(in_kg & in_baseline & in_status),
  n134 = venn_n(in_kg & in_baseline & in_tt),
  n234 = venn_n(in_kg & in_status & in_tt),
  # All four sources.
  n1234 = venn_n(in_baseline & in_status & in_tt & in_kg),
  category = c("Baseline dataset", "Status dataset", "Teele & Thelen", "KG Citation Data (Top 400)"),
  col = "gray40",
  fill = c("gray45", "gray70", "gray99", "gray25"),
  scaled = TRUE,
  margin = 0.08
)
#note: only brought them in partially -- not 'in' status dataset (have NAs)
#table(full_long$Count[is.na(full_long$status_total)])

ggsave(filename = "FiguresTF/alter_fig7.tiff", plot = fig7, width = 6.5, dpi = 900)
ggsave(filename = "FiguresPNG/alter_fig7.png", plot = fig7, width = 6.5, dpi = 900)
fig7

## Table 3 -- share of baseline faculty with no entry in the status data,
## by gender (rows flagged Non-TT are excluded).
is_tt_row <- full$b_positioncat != "Non-TT"
table(full$status01[is_tt_row], full$gender[is_tt_row])

# Counts per gender x status01; Status_Total and Percent are within gender.
t3_row_b <- full %>%
  filter(b_positioncat != "Non-TT") %>%
  group_by(gender, status01) %>%
  summarize(lineTotal = length(gender)) %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = (round(100 * lineTotal / Status_Total, 2)))
t3_row_b

# Same breakdown restricted to RU/VH institutions.
t3_row_ruvh <- full %>%
  filter(b_typeinstitution == "RU/VH" & b_positioncat != "Non-TT") %>%
  group_by(gender, status01) %>%
  summarize(lineTotal = length(gender)) %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = (round(100 * lineTotal / Status_Total, 2)))
t3_row_ruvh

t3 <- data.frame(
  "Alter et al. Baseline dataset" = c("Women", "Men", "Total", "Absence Rates \n (M-F)"),
  "Missing from Status dataset \n all Baseline \n (excluding section service)" = rep(NA, 4),
  "Missing from Status dataset \n RUVH (Baseline) \n (excluding section service)" = rep(NA, 4),
  check.names = FALSE
)

# Column 2: all baseline. Rows 1 and 3 of the summary are taken to be the
# status01 == 0 rows for gender 0 and gender 1 respectively.
women_b <- filter(t3_row_b, gender == 0, status01 == 0)
men_b <- filter(t3_row_b, gender == 1, status01 == 0)
absent_b <- t3_row_b$lineTotal[c(1, 3)]
total_b <- t3_row_b$Status_Total[c(1, 3)]
t3[1, 2] <- paste0(women_b$Percent, '%', ' (n=', women_b$lineTotal, ')')
t3[2, 2] <- paste0(men_b$Percent, '%', ' (n=', men_b$lineTotal, ')')
t3[3, 2] <- paste0(round(100 * sum(absent_b) / sum(total_b), 2), '%', ' (n=', sum(absent_b), ')')
t3[4, 2] <- paste0(round(-t3_row_b$Percent[1] + t3_row_b$Percent[3], 1), '%')

# Column 3: RU/VH only, same layout.
women_r <- filter(t3_row_ruvh, gender == 0, status01 == 0)
men_r <- filter(t3_row_ruvh, gender == 1, status01 == 0)
absent_r <- t3_row_ruvh$lineTotal[c(1, 3)]
total_r <- t3_row_ruvh$Status_Total[c(1, 3)]
t3[1, 3] <- paste0(women_r$Percent, '%', ' (n=', women_r$lineTotal, ')')
t3[2, 3] <- paste0(men_r$Percent, '%', ' (n=', men_r$lineTotal, ')')
t3[3, 3] <- paste0(round(100 * sum(absent_r) / sum(total_r), 2), '%', ' (n=', sum(absent_r), ')')
t3[4, 3] <- paste0(round(-t3_row_ruvh$Percent[1] + t3_row_ruvh$Percent[3], 1), '%')
t3

write.csv(t3, file = "Tables/alter_table3.csv")

# Plain data frames so single cells come back as bare numbers.
t3_row_b <- as.data.frame(t3_row_b)
t3_row_ruvh <- as.data.frame(t3_row_ruvh)
# Proportion tests: is the absence rate different from expectation by
# gender? -- these contain TT people
# (author's note: not statistically significant)
prop.test(
  c(t3_row_b[1, 3], t3_row_b[3, 3]),
  c(t3_row_b[1, 4], t3_row_b[3, 4])
) # test status membership by gender
prop.test(
  c(t3_row_ruvh[1, 3], t3_row_ruvh[3, 3]),
  c(t3_row_ruvh[1, 4], t3_row_ruvh[3, 4])
) # test status sec ab by gender



######
#Table 4: Highest Level of Status Attained --excluding section service
# Sample: baseline rows at RU/VH institutions, not Non-TT, with status_3 > 0.
# Status_Total and Percent are computed within gender.
t4_row2 <- full %>%
  filter(b_baseline == 1 & b_positioncat != "Non-TT" & b_typeinstitution == "RU/VH" & !is.na(status_3) & status_3 > 0) %>%
  group_by(gender, status_3) %>%
  summarize(lineTotal = length(gender)) %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(gender, add = TRUE) %>%
  mutate(Percent = (round(100 * lineTotal / Status_Total, 1)))
t4_row2

# Both genders pooled, per status level.
t4_sum2 <- t4_row2 %>%
  group_by(status_3) %>%
  summarize(total = sum(lineTotal)) %>%
  mutate(Status_Total = sum(total)) %>%
  mutate(Percent = (round(100 * total / Status_Total, 1)))
t4_sum2

t4_2 <- data.frame(
  "Alter et al. Baseline dataset" = c("Women", "Men", "Total", "Gender Representation \n Difference Rates \n (M-F)"),
  "Committee Member (1)" = rep(NA, 4),
  "Officer (2)" = rep(NA, 4),
  "Leader/Honor (3)" = rep(NA, 4),
  check.names = FALSE
)

# Fill one column per status level (1, 2, 3 -> columns 2, 3, 4).
# The M-F difference is looked up by gender and level instead of by row
# position, so it stays correct if a gender/level cell is absent, and it is
# rounded the same way in every column (the Officer column used to be
# written unrounded).
for (t4_level in 1:3) {
  t4_women <- filter(t4_row2, gender == 0, status_3 == t4_level)
  t4_men <- filter(t4_row2, gender == 1, status_3 == t4_level)
  t4_all <- filter(t4_sum2, status_3 == t4_level)
  t4_col <- t4_level + 1
  t4_2[1, t4_col] <- paste0(t4_women$Percent, '%', ' (n=', t4_women$lineTotal, ')')
  t4_2[2, t4_col] <- paste0(t4_men$Percent, '%', ' (n=', t4_men$lineTotal, ')')
  t4_2[3, t4_col] <- paste0(t4_all$Percent, '%', ' (n=', t4_all$total, ')')
  t4_2[4, t4_col] <- paste0(round(t4_men$Percent - t4_women$Percent, 2), '%')
}

t4_2
write.csv(t4_2, file = "Tables/alter_table4.csv")

# Proportion tests per status level. These index rows by position:
# rows 1-3 are taken as gender 0 (levels 1-3), rows 4-6 as gender 1.
t4_row2 <- as.data.frame(t4_row2)
prop.test(c(t4_row2[1, 3], t4_row2[4, 3]), c(t4_row2[1, 4], t4_row2[4, 4])) # test status 1 by gender
prop.test(c(t4_row2[2, 3], t4_row2[5, 3]), c(t4_row2[2, 4], t4_row2[5, 4])) # test status 2 by gender
prop.test(c(t4_row2[3, 3], t4_row2[6, 3]), c(t4_row2[3, 4], t4_row2[6, 4])) # test status 3 by gender
 

#### Figure 8: See relevant Stata do file
# Export the working data (including the columns derived above) in Stata
# format, into the current working directory.
library(foreign)
write.dta(dataframe = full, file = "Alter_et_al_final_dataset.dta")



#figure 9: names with 4 or more status points
fig9b <- full %>%
  select(s_name, gender, status_total, sec_pos_total, Count) %>%
  filter(status_total > 3) %>%
  arrange(desc(status_total))

# Bin the point totals: (3,5], (5,8], (8,14], (14,22].
fig9b$status_points <- cut(
  fig9b$status_total,
  breaks = c(3, 5, 8, 14, 22),
  labels = c("4-5", "6-8", "9-14", "15-22")
)
fig9n <- sum(fig9b$status_total > 3)

# Head count per gender and bin, plus totals per gender (Status_Total)
# and per bin (Status_Total_g).
fig9_f <- fig9b %>%
  group_by(gender, status_points) %>%
  summarize(lineTotal = length(gender)) %>%
  mutate(Status_Total = sum(lineTotal)) %>%
  group_by(status_points) %>%
  mutate(Status_Total_g = sum(lineTotal))

# Gender share within each bin, as a whole-percent label.
fig9_f$pct <- paste0(100 * round(fig9_f$lineTotal / fig9_f$Status_Total_g, digits = 2), "%")

fig9 <- ggplot(
  fig9_f,
  aes(
    x = status_points, y = lineTotal,
    fill = factor(gender, levels = c(1, 0), labels = c("Men", "Women"))
  )
) +
  geom_bar(position = "stack", stat = "identity", color = "black", width = 0.9) +
  scale_y_continuous(breaks = seq(0, 200, by = 20)) +
  coord_flip() +
  theme_bw() +
  scale_fill_manual(values = c("grey", "grey28"), name = "Gender") +
  labs(
    caption = paste0("Source: Alter et al. Status dataset, \n Individuals with Status of 4+ points, n=", fig9n),
    x = "Total Status Points",
    y = "Number of Individuals"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  ) +
  # Two stacked labels per segment: the count on top, the share underneath.
  geom_text(
    aes(label = paste0(lineTotal, "\n")),
    position = position_stack(vjust = 0.4),
    color = "white", fontface = "bold", size = 4
  ) +
  geom_text(
    aes(label = paste0("\n", '(', pct, ')')),
    position = position_stack(vjust = 0.4),
    color = "white", fontface = "bold", size = 2
  )

ggsave(filename = "FiguresTF/alter_fig9.tiff", plot = fig9, width = 6, height = 7, dpi = 900)
ggsave(filename = "FiguresPNG/alter_fig9.png", plot = fig9, width = 6, height = 7, dpi = 900)
fig9

## Figure 10: USES ADDITIONAL SUPPLEMENTAL DATA/INFORMATION
kg_status <- read_excel(
  "Codebook&Code/KG_Status.xlsx",
  sheet = "Status Ranking",
  range = "A1:G397"
)

# Citation counts are capped at 40,000 for plotting.
kg_status$cite_count <- kg_status$Count
kg_status$cite_count[kg_status$Count >= 40000] <- 40000
kg_status$Gender <- factor(kg_status$gender, levels = c(0, 1), labels = c("Women", "Men"))
# Number of people with a positive, non-missing (capped) citation count.
fig10n <- sum(!is.na(kg_status$cite_count) & kg_status$cite_count > 0)

fig10 <- ggplot(
  kg_status,
  aes(x = cite_count, y = status_total, shape = Gender, color = Gender, size = Gender)
) +
  geom_point() +
  theme_bw() +
  labs(
    caption = paste0("Sources: Alter et al. Status dataset & KG dataset, \n Individuals with Status and KG data, n=", fig10n),
    x = "Citation Count (KG data)",
    y = "Total Status Points"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  ) +
  scale_color_manual(values = c("black", "grey")) +
  scale_shape_manual(values = c(17, 16)) +
  scale_size_manual(values = c(3, 2)) +
  # A "+" drawn just below the panel at the 40,000 cap; clipping is turned
  # off so it can sit outside the plotting area.
  annotation_custom(
    textGrob("+", gp = gpar(fontsize = 8)),
    xmin = 40000, xmax = 42350, ymin = 2.5, ymax = 2.55
  ) +
  coord_cartesian(ylim = c(4, 23), clip = "off")

ggsave(filename = "FiguresTF/alter_fig10.tiff", plot = fig10, width = 8, height = 6, dpi = 900)
ggsave(filename = "FiguresPNG/alter_fig10.png", plot = fig10, width = 8, height = 6, dpi = 900)
fig10

#overlap in our and KG dataset
nrow(table(full$Count[full$status_total > 3], useNA = "ifany")) #139
nrow(table(kg_status$Count[kg_status$status_total > 3], useNA = "ifany")) #262 -- 262 are present in both

##################################################################
###WARNING: CALLS FROM OTHER TABLES SO BE SURE THEY ARE UNCHANGED
# Table 5: Centralized v Decentralized Selection Processes
# Each row compares the share of women in two groups: column 2 and column 4
# hold the two percentages, column 3 their difference (col 2 - col 4).
# Rows 1 and 3 read fig4_final, f3_final and fig5_final by ROW/COLUMN
# POSITION, so they depend on the row order those objects had when they
# were built in the Figure 3/4/5 sections.
t5 <- data.frame("table 5 **CHECK IF CHANGE table numbers/format"= c("Pres vs Honor Society", "Ed of prof assoc. journals vs ind journals", "Ed boards", "Tenureline vs Kim/Grofman"),
                 "% Women" = rep(NA, 4),
                 "Difference"= rep(NA, 4),
                 "% Women" = rep(NA, 4),
                 check.names=FALSE)

#row 1: apsa (pres, vp, secretary, council,) and ISA (president, vp) vs honor societies, named prizes, lectures
## v1: v2: s_HonorLec
# Sums columns 3 and 4 of fig4_final (lineTotal and Status_Total) over rows 5-10.
# NOTE(review): presumably these are the women's rows for the six APSA/ISA
# offices -- confirm against the printed fig4_final.
t5ap<-sum(fig4_final[c( 6,8,7,5, 9, 10),3])
t5aps<-sum(fig4_final[c( 6,8,7,5, 9, 10),4])
t5[1,2] <- round((100* t5ap / t5aps), 1) 
# f3_final row 3, column 5 is a Percent value stored as text.
# NOTE(review): presumably the gender 0 / s_HonorLec row -- confirm row order.
t5[1,4] <- round(as.numeric(f3_final[3,5]), 1)
t5[1,3]<- round(t5[1,2]-t5[1,4],1)

#r1 prop test
prop.test( c(t5ap, as.numeric(f3_final[3,3]) ), c(t5aps, as.numeric(f3_final[3,4])))  #test status 0 by gender


#row 2: Editors: APSR, POP, AJPS, POQ, PA, ;
# Non-missing entries per gender across the listed editor columns; the first
# row of the summary is used as the numerator.
# NOTE(review): assumes the first row is gender == 0 (women) -- confirm.
ced<- full  %>% gather(Pos, Count, s_apsr_co.editors,s_pop_lead.editor, s_editor,s_editor.3, s_editor.in.chief.2 ) %>%
  group_by(gender) %>% summarize(lineTotal=sum(!is.na(Count))) 
t5[2,2] <- round(100*ced[1,2]/sum(ced[,2]),1)

#Editors of independent journals: IS, APR, WP, PT, JOP, JCR, IO, CPS, CP, PB (IS only counting editor position only; JCR only editor position)
ded<- full  %>% gather(Pos, Count, "s_editor.5", "s_editor.6", "s_chair", "s_pt_editor", "s_chairman", "s_editor.in.chief.1",
                       "s_editor.7",   "s_editor.2", "s_editor.in.chief", "s_editors",
                       "s_editors.in.chief", "s_editor.4" ) %>%  group_by(gender) %>% summarize(lineTotal=sum(!is.na(Count)))
t5[2,4] <- round(100*ded[1,2]/sum(ded[,2]),1)
t5[2,3]<- round(t5[2,2]-t5[2,4],1)

#r2 prop test #not sig
prop.test( c(as.numeric(ced[1,2]), as.numeric(ded[1,2])) , c(sum(ced[,2]), sum(ded[,2])))   


#row 3: use figure 8 data Editorial: APSR, POP, AJPS, POQ, PA, ;
#Editorial boards of independent journals: IS, APR, WP, PT, JOP, JCR, IO, CPS, CP, PB
# The objects actually read here are fig5_final columns 3 and 4 (lineTotal
# and Bd_Total), split into two sets of row positions.
# NOTE(review): the positions presumably pick the women's row of each
# journal -- confirm against fig5_final before changing Figure 5.
t5[3,2] <- round(sum(fig5_final[c(1,8,2,11,6),3])/sum(fig5_final[c(1,8,2,11,6),4])*100, 1)
t5[3,4] <- round(sum(fig5_final[c(3,4,5,7,9,10,12,13,14,15),3])/sum(fig5_final[c(3,4,5,7,9,10,12,13,14,15),4])*100, 1)
t5[3,3]<- round(t5[3,2]-t5[3,4],1)

#r3 proptest #not sig
prop.test( c(sum(fig5_final[c(1,8,2,11,6),3]), 
             sum(fig5_final[c(3,4,5,7,9,10,12,13,14,15),3])) , 
           c(sum(fig5_final[c(1,8,2,11,6),4]), 
             sum(fig5_final[c(3,4,5,7,9,10,12,13,14,15),4])))   


#row 4: baseline vs KG
# Gender counts for baseline rows that are neither Non-TT nor Emeritus (bf)
# and for rows with kg_cite == 1 (kg); the first table entry is the numerator.
bf<-as.data.frame(table(full$gender[full$b_baseline==1 & full$b_positioncat!="Non-TT" & full$b_positioncat!="Emeritus"]))
kg<-as.data.frame(table(full$gender[full$kg_cite==1])  )
t5[4,2] <- round(100*bf[1,2]/sum(bf[,2]),1)
t5[4,4] <- round(100*kg[1,2]/sum(kg[,2]),1)
t5[4,3] <- round(t5[4,2]-t5[4,4],1)

#r4 prop test  
prop.test( c(as.numeric(bf[1,2]), as.numeric(kg[1,2])) , c(sum(bf[,2]), sum(kg[,2])))   


# Print the finished table to the log and export it.
t5
write.csv(t5, file='Tables/alter_table5.csv')


# Write the working data (now including the derived columns) back to
# Codebook&Code/. Relative to the working directory set at the top of the
# script, this is the same CSV the script reads as its input.
# Fields are quoted (the write.csv default): with quote = FALSE any text
# value containing a comma -- a name or an institution, for instance -- is
# split into extra columns, and the file can no longer be read back reliably
# by the read.csv() call at the top.
write.csv(full, file = "Codebook&Code/Alter_et_al_final_dataset.csv", row.names = FALSE)
# Stop diverting console output to the log file.
sink()


